These examples show how we can use Python to load text from a file and use code to split it into paragraphs, sentences, and words. In Python, text is represented in the string
format, which behaves like an immutable sequence of characters. We can use standard Python libraries such as re
to work with string objects. We can also use Python's collections
library to find unique words and characters and count their occurrences in a piece of text.
In [ ]:
# first we import the 're' (regular expression) library, which lets us search for and replace patterns in string objects
import re
In [ ]:
filename = "data/wonderland.txt"
# Read the whole file inside a context manager so the file handle is closed
# as soon as reading finishes (or fails), instead of being left open.
# The encoding is stated explicitly so the result does not depend on the
# platform's default locale; undecodable bytes are skipped because the
# substitution below discards every non-ASCII character anyway.
with open(filename, encoding="utf-8", errors="ignore") as text_file:
    raw_text = text_file.read()
# get rid of any characters other than newlines, letters, numbers, spaces,
# and a few punctuation marks (, . : ; ? ! -)
raw_text = re.sub('[^\nA-Za-z0-9 ,.:;?!-]+', '', raw_text)
n_chars = len(raw_text)
print("length of text:", n_chars)
In [ ]:
# Treat every line of the cleaned text as one paragraph, skipping lines that
# are empty or contain nothing but whitespace.
paragraphs = [p for p in raw_text.split('\n') if p.strip()]
print("number of paragraphs:", len(paragraphs))

# Strip everything except letters and spaces from each paragraph, then split
# it into words. str.split() with no argument splits on runs of whitespace and
# never returns empty strings, so leading, trailing, or repeated spaces do not
# produce bogus "" entries that would inflate the word count.
words = []
for p in paragraphs:
    words += re.sub('[^A-Za-z ]+', '', p).split()
print("number of words:", len(words))

# Flatten the words into one list of single-character strings.
letters = []
for w in words:
    letters.extend(w)
print("number of letters:", len(letters))
In [ ]:
# The standard-library 'collections' module provides Counter, a dict subclass
# that tallies how many times each distinct item appears in an iterable.
# https://docs.python.org/3/library/collections.html#counter-objects
import collections

# Tally the words and show the ten most frequent as (word, count) pairs.
wordSet = collections.Counter()
wordSet.update(words)
top_words = wordSet.most_common(10)
print("most common words:", top_words)

# The Counter's keys are the distinct words, so listing them gives the vocabulary.
uniqueWords = list(wordSet.keys())
n_unique = len(uniqueWords)
print("unique words:", n_unique)

# Same tally for individual letters.
letterSet = collections.Counter()
letterSet.update(letters)
top_letters = letterSet.most_common(10)
print("most common letters:", top_letters)